# Environment setup: silence warnings, import the scientific stack, and
# configure pandas display options for interactive exploration.
import warnings
warnings.filterwarnings('ignore')
import sys
import scipy
import numpy
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
# NOTE: IPython magic — this line is only valid inside a Jupyter notebook,
# not in a plain .py script.
%matplotlib inline
import sklearn
import os
from scipy import stats as st
pd.options.display.max_columns = None  # show every column when displaying frames
from scipy.stats import norm, skew, kurtosis
from sklearn.preprocessing import StandardScaler
pd.set_option('display.float_format', lambda x: '{:.5f}'.format(x)) # limit float display to 5 decimal places
# Load the Kaggle "House Prices" train/test sets.
# NOTE(review): hard-coded absolute Windows paths — this only runs on one
# machine; consider a relative path or configuration variable.
train = pd.read_csv('C:/Users/Mrinal/Desktop/house pricing/train.csv')
test = pd.read_csv('C:/Users/Mrinal/Desktop/house pricing/test.csv')
y_train = train['SalePrice']  # raw (un-logged) target
#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
#Now drop the 'Id' colum since it's unnecessary for the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)
train.shape, test.shape, y_train.shape
# First-pass exploration of the training data (notebook inspection cells;
# the bare expressions display their value in a notebook and do nothing in a script).
train.info()
train.describe()
train.sample(5)
test.sample(5)
train.columns
train['SalePrice'].describe()
train.groupby('SaleType').count()
# Split the training frame into numeric and categorical feature subsets.
numeric_features=train.select_dtypes(include=[np.number])
# BUG FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24, so
# select_dtypes(include=[np.object]) raises AttributeError on modern NumPy.
# The builtin `object` (or the string 'object') is the supported spelling.
categorical_features=train.select_dtypes(include=[object])
numeric_features.columns
categorical_features.columns
# Target distribution: compare SalePrice against normal and log-normal fits.
# NOTE: sns.distplot and the FacetGrid `size=` argument are deprecated in
# newer seaborn releases (histplot/displot and `height=` replace them) —
# this code assumes an older seaborn.
plt.figure(1); plt.title('Normal')
sns.distplot(train['SalePrice'])
plt.figure(2); plt.title('Normal')
sns.distplot(train['SalePrice'], kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(train['SalePrice'], kde=False, fit=st.lognorm)
#skewness and kurtosis
train['SalePrice'].skew(), train['SalePrice'].kurt()
# Log-transform the target; the logged target is much closer to normal,
# as the reduced skew/kurtosis below confirms.
train['SalePrice1'] = np.log(train['SalePrice'])
train['SalePrice1'].skew(), train['SalePrice1'].kurt()
# SalePrice against overall quality: scatter, box and strip plots.
sns.FacetGrid(train, hue="OverallQual", size=7) \
    .map(plt.scatter, "OverallQual", "SalePrice") \
    .add_legend()
plt.show()
ax= sns.boxplot(x="OverallQual", y="SalePrice", data=train)
ax= sns.stripplot(x="OverallQual", y="SalePrice", data=train, jitter=True, edgecolor="black")
plt.show()
sns.barplot(train.OverallQual,train.SalePrice)
# Same views for overall condition.
sns.FacetGrid(train, hue="OverallCond", size=5) \
    .map(plt.scatter, "OverallCond", "SalePrice") \
    .add_legend()
plt.show()
ax= sns.boxplot(x="OverallCond", y="SalePrice", data=train)
ax= sns.stripplot(x="OverallCond", y="SalePrice", data=train, jitter=True, edgecolor="black")
plt.show()
sns.barplot(train.OverallCond,train.SalePrice)
# Histograms of every numeric column, then correlation of each numeric
# feature with SalePrice, then pairwise plots of the strongest candidates.
train.hist(figsize=(15,15))
plt.figure()
correlation = numeric_features.corr()
print(correlation['SalePrice'].sort_values(ascending = False))
sns.set()
# Hand-picked features with the highest correlation to SalePrice (from the
# printed ranking above).
columns = ['SalePrice','OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF','1stFlrSF','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd']
sns.pairplot(train[columns],size = 2 ,kind ='scatter')
plt.show()
# KDE of YearBuilt and SalePrice split by quality level.
sns.FacetGrid(train[columns], hue="OverallQual", size=5).map(sns.kdeplot, "YearBuilt").add_legend()
plt.show()
sns.FacetGrid(train[columns], hue="OverallQual", size=5).map(sns.kdeplot, "SalePrice").add_legend()
plt.show()
sns.jointplot(x="OverallQual", y="SalePrice", data=train, size=8,ratio=8, kind='hex',color='red')
plt.show()
# Multivariate visualisations of the selected columns.
# BUG FIX: andrews_curves/radviz moved from pandas.tools.plotting to
# pandas.plotting in pandas 0.20; the old path raises ImportError on any
# modern pandas release.
from pandas.plotting import andrews_curves
andrews_curves(train[columns], "YearBuilt",colormap='gist_rainbow')
plt.show()
# Joint KDE plots of pairwise relationships among key features.
sns.jointplot(x="SalePrice", y="YearBuilt", data=train, size=7, kind='kde', color='#269DC4', space=0)
sns.jointplot(x="SalePrice", y="OverallQual", data=train, size=7, kind='kde', color='#46C426', space=0)
sns.jointplot(x="YearBuilt", y="OverallQual", data=train, size=7, kind='kde', color='#A62668', space=0)
sns.jointplot(x="OverallCond", y="OverallQual", data=train, size=7, kind='kde', color='#D47B22', space=0)
from pandas.plotting import radviz
radviz(train[columns], "OverallQual")
# Full correlation heatmap, a zoomed heatmap of the selected columns, and
# the 10 features most positively correlated with SalePrice.
corrmat = train.corr()
plt.subplots(figsize=(40, 30))
sns.heatmap(corrmat, vmax=1, annot=True)
sns.set(font_scale=1.8);
zoomedCorrelation = correlation.loc[columns,columns]
f , ax = plt.subplots(figsize = (14,12))
plt.title('Correlation of numeric features',size=15)
sns.heatmap(zoomedCorrelation, square = True, linewidths=0.01, vmax=1, annot=True,cmap='plasma',
            linecolor="black", annot_kws = {'size':15})
sns.set(font_scale=1.25)
corrmat=train.corr()
cols = corrmat.nlargest(10, 'SalePrice')['SalePrice'].index  # top-10 correlated features
cm = np.corrcoef(train[cols].values.T)
sns.heatmap(cm, annot=True, square=True,
            fmt='.1f', annot_kws={'size':13},
            yticklabels=cols.values,
            xticklabels=cols.values)
# Distribution of SalePrice within two categorical features.
categorical_features.Functional.value_counts()
sns.violinplot(data=train,x="Functional", y="SalePrice")
categorical_features.Neighborhood.value_counts()
plt.subplots(figsize=(50, 30))
sns.violinplot(data=train,x="Neighborhood", y="SalePrice")
sns.set(font_scale=2.5);
# Combine train and test so imputation/encoding is applied consistently to
# both; test rows get NaN in SalePrice/SalePrice1.
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.shape
# Per-column percentage of missing values (top 30, fully-populated columns excluded).
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data
f, ax = plt.subplots(figsize=(15, 12))
plt.xticks(rotation='90')
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
# columns to be dropped: anything missing in more than 60% of rows
columnsToDrop = missing_data[missing_data['Missing Ratio']>60].index
all_data = all_data.drop(columnsToDrop, axis=1)
print(all_data.shape)
# Impute missing values.
# For these categoricals, NaN means the house lacks the feature -> 'None'.
for col in ('FireplaceQu','MSSubClass','MasVnrType','BsmtQual', 'BsmtCond', 'BsmtExposure',
            'BsmtFinType1', 'BsmtFinType2','GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'Fence','Alley','MiscFeature','PoolQC'):
    if col in all_data.columns:  # some were already dropped for >60% missingness
        all_data[col] = all_data[col].fillna('None')
# For these numeric area/count columns, NaN means the feature is absent -> 0.
for col in ('GarageArea', 'GarageCars','BsmtFinSF1',
            'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','MasVnrArea'):
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(0)
all_data.Functional.value_counts(),all_data.Utilities.value_counts()
# For these, impute with the most frequent value (mode).
for col in ('MSZoning','Electrical','KitchenQual','Exterior1st','Exterior2nd','SaleType','Functional','Utilities'):
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
all_data.Functional.value_counts(),all_data.Utilities.value_counts()
# Impute the missing values of 'GarageYrBlt' based on the median of 'YearBuilt'
# (median GarageYrBlt within the house's YearBuilt decile).
all_data['YearBuiltCut'] = pd.qcut(all_data.YearBuilt,10)
all_data['GarageYrBlt']= all_data.groupby(['YearBuiltCut'])['GarageYrBlt'].transform(lambda x : x.fillna(x.median()))
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].astype(int)
all_data.drop('YearBuiltCut',axis=1,inplace=True)
# Impute the missing values of 'LotFrontage' based on the median of 'Neighborhood'
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
# Re-check: only SalePrice/SalePrice1 (NaN for test rows by construction)
# should still show as missing.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head()
# Inspect the 10 features LEAST correlated with SalePrice, then drop a
# hand-picked subset of them as uninformative.
sns.set(font_scale=1.4)
corrmat=train.corr()
cols = corrmat.nsmallest(10, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train[cols].values.T)
sns.heatmap(cm, annot=True, square=True,
            fmt='.1f', annot_kws={'size':10},
            yticklabels=cols.values,
            xticklabels=cols.values)
# Features judged uncorrelated with the target after inspecting the heatmap.
Uncor = ['EnclosedPorch',
         'LowQualFinSF',
         'MiscVal', 'BsmtHalfBath', 'BsmtFinSF2']
all_data.drop(Uncor, axis=1, inplace=True)
all_data.info()
all_data.shape
# Scatter of living area vs. price over the combined frame (test rows have
# NaN SalePrice and simply don't appear).
fig, ax = plt.subplots()
ax.scatter(x = all_data['GrLivArea'], y = all_data['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
# Split the combined frame back into train (first 1460 rows) and test.
tr=all_data[:1460]
# BUG FIX: `ts` is a slice (view) of `all_data`, so dropping with
# inplace=True triggers SettingWithCopyWarning and is not guaranteed to
# modify the frame we keep. Reassigning the drop() result is safe and
# yields the identical frame.
ts = all_data[1460:].drop('SalePrice', axis=1)
tr.shape,ts.shape
# Outlier removal on the training portion only.
fig, ax = plt.subplots()
ax.scatter(x = tr['GrLivArea'], y = tr['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
#dropping outliers of GrLivArea as it is evidently out of the trend
tr = tr.drop(tr[(tr['GrLivArea']>4000) & (tr['SalePrice']<300000)].index)
# Re-plot to confirm the GrLivArea outliers are gone.
fig, ax = plt.subplots()
ax.scatter(x = tr['GrLivArea'], y = tr['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
tr.head()
#visually checking for outliers: scatter every numeric feature against SalePrice
from pandas.api.types import is_numeric_dtype
for name in list(tr.columns):
    if is_numeric_dtype(tr[name]):
        data = pd.concat([tr['SalePrice'], tr[name]], axis=1)
        data.plot.scatter(x=name, y='SalePrice', ylim=(0,800000))
# removing outliers after visual inspection (thresholds chosen by eye from
# the scatter plots above)
tr = tr.drop(tr[tr['1stFlrSF'] > 2750].index)
tr = tr.drop(tr[tr['3SsnPorch'] > 500].index)
tr = tr.drop(tr[tr['BedroomAbvGr'] > 7].index)
tr = tr.drop(tr[tr['BsmtFullBath'] > 2.5].index)
tr = tr.drop(tr[tr['GarageCars'] > 3].index)
tr = tr.drop(tr[(tr['GarageArea'] > 1200) & (tr['SalePrice'] < 300000)].index)
tr = tr.drop(tr[tr['LotArea'] > 20000].index)
tr = tr.drop(tr[tr['LotFrontage'] > 300].index)
tr = tr.drop(tr[tr['MasVnrArea'] > 1500].index)
tr = tr.drop(tr[(tr['OpenPorchSF'] > 500) & (tr['SalePrice'] < 100000)].index)
tr = tr.drop(tr[(tr['LotFrontage'] > 150) & (tr['SalePrice'] < 100000)].index)
# Re-plot every numeric feature to confirm the removals.
for name in list(tr.columns):
    if is_numeric_dtype(tr[name]):
        data = pd.concat([tr['SalePrice'], tr[name]], axis=1)
        data.plot.scatter(x=name, y='SalePrice', ylim=(0,800000))
#Exclude outlier in numeric dtype under 0.05 and over 0.95 quantile
from pandas.api.types import is_numeric_dtype
def remove_outlier(tr):
    """Return a copy of *tr* keeping only rows strictly inside the
    5th-95th percentile band of every numeric column.

    Quantile bounds are computed once on the full frame; the filter is
    then applied column by column, so each mask shrinks the frame
    further. Rows holding NaN in a numeric column compare False against
    the bounds and are therefore dropped too. Non-numeric columns are
    left untouched and do not filter anything.
    """
    lo_q, hi_q = 0.05, 0.95
    bounds = tr.quantile([lo_q, hi_q])
    filtered = tr
    for col in list(tr.columns):
        if not is_numeric_dtype(tr[col]):
            continue
        lower, upper = bounds.loc[lo_q, col], bounds.loc[hi_q, col]
        filtered = filtered[(filtered[col] > lower) & (filtered[col] < upper)]
    return filtered
# Demonstration only: the result of remove_outlier is displayed, not
# assigned, so the quantile filter does NOT actually modify `tr`.
remove_outlier(tr).head()
tr.shape, ts.shape
# NOTE(review): plain assignment — this is a second reference to the same
# frame, not a snapshot; use all_data.copy() for a real backup.
all_data_copy= all_data
# Recombine the cleaned train rows with the test rows.
all_data = pd.concat((tr, ts)).reset_index(drop=True)
all_data.shape
#total square footage: basement + first floor + second floor
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# Numeric feature columns, excluding both target columns.
numeric_all = all_data.dtypes[(all_data.dtypes != "object") & (all_data.columns!='SalePrice')& (all_data.columns!='SalePrice1')].index
# Check the skew of all numerical features
skewed_feats = all_data[numeric_all].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(20)
# Box-Cox transform variables whose |skew| > 0.8.
# BUG FIX: indexing a DataFrame with a boolean DATAFRAME
# (`skewness[abs(skewness) > 0.8]`) keeps every row — non-matching cells
# just become NaN — so the count printed below was wrong and EVERY numeric
# feature was transformed. Filter on the 'Skew' column (a Series) instead.
skewness = skewness[abs(skewness['Skew']) > 0.8]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15  # fixed Box-Cox lambda, a common choice for this dataset
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)
all_data.columns
all_data.columns
# Converting numeric features to categorical features: these codes are
# labels, not quantities (year/month sold, dwelling class, condition rating).
strCols = ['YrSold','MoSold','MSSubClass','OverallCond']
for i in strCols:
    all_data[i]=all_data[i].astype(str)
# NOTE(review): "numeric" is not a pandas dtype name, so this mask is always
# True and the expression just lists every column — probably meant "object".
all_data[all_data.dtypes[all_data.dtypes != "numeric"].index].columns
all_data.ExterQual.value_counts()
#LabelEncoder to categorical features : these are categorical features which are ordinal i.e. their categories are ranks
from sklearn.preprocessing import LabelEncoder
cols = ( 'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtQual','CentralAir',
         'ExterCond','ExterQual','FireplaceQu','Functional','GarageCond','GarageFinish','GarageQual','HeatingQC',
         'KitchenQual','LandSlope','LotShape','MoSold','MSSubClass','OverallCond','PavedDrive','Street','YrSold')
for c in cols:
    lbl = LabelEncoder()
    # NOTE(review): LabelEncoder assigns codes in sorted-label order, which
    # does not necessarily follow the quality ranking (e.g. Ex/Fa/Gd/TA) —
    # an explicit ordinal mapping would preserve the intended order.
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))
print('Shape all_data: {}'.format(all_data.shape))
all_data.ExterQual.value_counts()
#get dummy variables: one-hot encode the remaining categorical features
all_data = pd.get_dummies(all_data)
print(all_data.shape)
# Replace the raw SalePrice with its log-transformed version.
all_data['SalePrice']=all_data['SalePrice1']
all_data.drop('SalePrice1',axis=1, inplace=True)
print(all_data.shape)
# split back into test & train — test rows are exactly those whose
# SalePrice is NaN (they never had a target).
# BUG FIX: the original dropped columns with inplace=True on slices of
# all_data, which triggers SettingWithCopyWarning and is not guaranteed to
# modify the intended frame; it also re-masked the already-filtered
# X_train to obtain y. Reassignment gives the identical frames safely.
train_mask = all_data['SalePrice'].notna()
X_train = all_data[train_mask]
y_train_T = X_train['SalePrice']
X_train = X_train.drop(columns = 'SalePrice')
X_test = all_data[~train_mask].drop(columns = 'SalePrice')
X_train.shape, y_train_T.shape, X_test.shape
#confirm lognormal: the log-target should look normal / raw target log-normal
plt.figure(3); plt.title('Log Normal')
sns.distplot(y_train_T, kde=False, fit=st.lognorm)
# Using Robust Scaler to transform X_train (robust to remaining outliers:
# centers on the median and scales by the IQR)
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
# Fit on the training data only, then apply the same scaling to the test set.
X_train_scaled = robust_scaler.fit(X_train).transform(X_train)
X_test_scaled = robust_scaler.transform(X_test)
#Feature importance using lasso (L1 regularisation zeroes out uninformative coefficients)
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.001)
lasso.fit(X_train_scaled,y_train_T)
y_pred_lasso = lasso.predict(X_test_scaled)
lassoCoeff = pd.DataFrame({"Feature Importance":lasso.coef_}, index=all_data.drop(columns = 'SalePrice').columns)
lassoCoeff.sort_values("Feature Importance",ascending=False)
# Plot only the features lasso kept (non-zero coefficients).
lassoCoeff[lassoCoeff["Feature Importance"]!=0].sort_values("Feature Importance").plot(kind="barh",figsize=(20,35),fontsize= 20)
# Principal Component Analysis of data such that 95% of the variance is retained
from sklearn.decomposition import PCA
# NOTE(review): PCA is fitted on train+test combined — information from the
# test set leaks into the components; fitting on train only would be cleaner.
data = np.concatenate([X_train_scaled,X_test_scaled])
pca = PCA(0.95)  # keep enough components to explain 95% of the variance
data = pca.fit_transform(data)
varPCA = np.round(pca.explained_variance_ratio_*100, decimals = 1)
print(varPCA)
# Principal Component Analysis plot of the data
plt.figure(figsize=(16,12))
plt.bar(x=range(1,len(varPCA)+1), height = varPCA)
plt.ylabel("Explained Variance (%)", size = 15)
plt.xlabel("Principle Components", size = 15)
plt.title("Principle Component Analysis Plot : Training Data", size = 15)
plt.show()
X_train.shape, y_train_T.shape, X_test.shape, X_train.shape[0]
# Shape of final data we will be working on: split the PCA output back
# into train and test portions by row count.
X_train_scaled = data[:X_train.shape[0]]
X_test_scaled = data[X_train.shape[0]:]
X_train_scaled.shape, y_train_T.shape, X_test_scaled.shape
# importing the models
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet, Lasso, SGDRegressor, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.svm import LinearSVR,SVR
import xgboost as xgb
# creating the models: one entry per candidate regressor, default and
# hand-tuned variants side by side.
models = [
    LinearRegression(),
    SVR(),
    SGDRegressor(),
    SGDRegressor(max_iter=1000, tol = 1e-3),
    RandomForestRegressor(),
    Lasso(),
    Lasso(alpha=0.01,max_iter=10000),
    Ridge(),
    BayesianRidge(),
    KernelRidge(),
    KernelRidge(alpha=0.6,kernel='polynomial',degree = 2,coef0=2.5),
    ElasticNet(),
    ElasticNet(alpha = 0.001,max_iter=10000),
    ExtraTreesRegressor(),
    GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                              max_depth=4, max_features='sqrt',
                              min_samples_leaf=15, min_samples_split=10,
                              loss='huber', random_state =5),
    xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                     learning_rate=0.05, max_depth=3,
                     min_child_weight=1.7817, n_estimators=2200,
                     reg_alpha=0.4640, reg_lambda=0.8571,
                     subsample=0.5213, silent=1,
                     random_state =7, nthread = -1),
]
# One display name per model, in the same order as `models`.
# BUG FIX: the original list had 17 names for 16 models (a stray
# 'Gradient boosting tree' entry before 'Random forest'), so zip()
# mislabelled every model from RandomForest onward and silently dropped
# XGBoost from the evaluation loop.
names = ['Linear regression','Support vector regression','Stochastic gradient descent',
         'Stochastic gradient descent 2','Random forest',
         'Lasso regression','Lasso regression 2','Ridge regression','Bayesian ridge regression',
         'Kernel ridge regression','Kernel ridge regression 2','Elastic net regularization',
         'Elastic net regularization 2','Extra trees regression','Gradient Boosting','XGBoost']
# Define a root mean square error function
def rmse(model,X,y):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    Uses scikit-learn's negated-MSE scorer, so the sign is flipped back
    before taking the square root. Returns a NumPy array of 5 fold scores.
    """
    # Imported locally so the function works regardless of notebook
    # cell-execution order (the module-level import appears only later).
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(model,X,y,scoring="neg_mean_squared_error",cv=5)
    return np.sqrt(-scores)
from sklearn.model_selection import KFold,cross_val_score
warnings.filterwarnings('ignore')
# Perform 5-fold cross-validation to evaluate each candidate model,
# reporting the mean and spread of the per-fold RMSE.
for model, name in zip(models, names):
    # Root mean square error
    score = rmse(model,X_train_scaled,y_train_T)
    print("- {} : mean : {:.6f}, std : {:4f}".format(name, score.mean(),score.std()))
import xgboost as xgb
# BUG FIX: the original constructed an XGBRegressor but never assigned it,
# then called rmse() on the leftover loop variable `model` — i.e. it
# re-scored the LAST model from the previous loop, not XGBoost. Assign the
# estimator and score it explicitly.
xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
score = rmse(xgb_model,X_train_scaled,y_train_T)
print("- {} : mean : {:.6f}, std : {:4f}".format('XGBoost', score.mean(),score.std()))
#GridSearch Cross Validation of all models
from sklearn.model_selection import GridSearchCV
class gridSearch():
    """Thin wrapper around GridSearchCV that reports the best RMSE.

    Fits against the module-level X_train_scaled / y_train_T.
    """
    def __init__(self,model):
        self.model = model
    def grid_get(self,param_grid):
        """Run a 5-fold grid search and print the best params and RMSE."""
        grid_search = GridSearchCV(self.model,param_grid,cv=5,scoring='neg_mean_squared_error')
        grid_search.fit(X_train_scaled,y_train_T)
        # CLEANUP: the original also overwrote grid_search.cv_results_
        # ['mean_test_score'] with RMSE values, but grid_search is a local
        # that is discarded right after printing — dead code, removed.
        # best_score_ is negated MSE; convert to RMSE for reporting.
        print('\nBest parameters : {}, best score : {}'.format(grid_search.best_params_,np.sqrt(-grid_search.best_score_)))
# Grid-search each model family for its best hyper-parameters.
gridSearch(SVR()).grid_get(
    {'C':[13,15,17,19,21],'kernel':["rbf"],"gamma":[0.0005,0.001,0.002,0.01],"epsilon":[0.01,0.02,0.03,0.1]})
gridSearch(Lasso()).grid_get(
    {'alpha':[0.01,0.001,0.0001,0.0002,0.0003,0.0004,0.0005,0.0006,0.0007,0.0008,0.0009],'max_iter':[10000]})
gridSearch(Ridge()).grid_get(
    {'alpha':[10,20,25,30,35,40,45,50,55,57,60,65,70,75,80,100],'max_iter':[10000]})
gridSearch(KernelRidge()).grid_get(
    {'alpha':[3.5,4,4.5,5,5.5,6,6.5], 'kernel':["polynomial"], 'degree':[3],'coef0':[1,1.5,2,2.5,3,3.5]})
gridSearch(ElasticNet()).grid_get(
    {'alpha':[0.006,0.0065,0.007,0.0075,0.008],'l1_ratio':[0.070,0.075,0.080,0.085,0.09,0.095],'max_iter':[10000]})
# Final models instantiated with the tuned hyper-parameters found above.
linreg = LinearRegression()
svr = SVR(C = 13, epsilon= 0.02, gamma = 0.001, kernel = 'rbf')
lasso = Lasso(alpha= 0.0007, max_iter= 10000)
ridge = Ridge(alpha=30, max_iter= 10000)
kerridge = KernelRidge(alpha=6.5 ,kernel='polynomial', degree=3 , coef0=3)
enet = ElasticNet(alpha=0.008,l1_ratio=0.07,max_iter=10000)
bayridge = BayesianRidge()
gb = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                               max_depth=4, max_features='sqrt',
                               min_samples_leaf=15, min_samples_split=10,
                               loss='huber', random_state =5)
# NOTE(review): this rebinds the name `xgb` from the xgboost MODULE to an
# estimator instance — any later `xgb.XGBRegressor(...)` call would fail;
# a distinct variable name would be safer.
xgb=xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                     learning_rate=0.05, max_depth=3,
                     min_child_weight=1.7817, n_estimators=2200,
                     reg_alpha=0.4640, reg_lambda=0.8571,
                     subsample=0.5213, silent=1,
                     random_state =7, nthread = -1)
#Average Models
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that averages the predictions of several base models."""
    def __init__(self, models):
        self.models = models
    def fit(self, X, y):
        """Fit a fresh clone of every base model on (X, y); never touches the originals."""
        self.models_ = []
        for base in self.models:
            fitted = clone(base)
            fitted.fit(X, y)
            self.models_.append(fitted)
        return self
    def predict(self, X):
        """Return the element-wise mean of the fitted base models' predictions."""
        stacked = np.column_stack([m.predict(X) for m in self.models_])
        return stacked.mean(axis=1)
# Candidate model pools for ensembling: the full set and a linear-only subset.
models=[linreg,svr,lasso,ridge,kerridge,enet,bayridge,gb,xgb]
names=['linreg','svr','lasso','ridge','kerridge','enet','bayridge','gb','xgb']
models1=[svr,lasso,ridge,kerridge,enet,bayridge]
names1=['svr','lasso','ridge','kerridge','enet','bayridge']
d = dict(zip(names,models))
d1= dict(zip(names1,models1))
#check for combinations: print every 3-model combination of the full pool
import itertools
i=0  # NOTE(review): unused counter
for L in range(3, 4):  # only L == 3; kept as a range for easy widening
    for subset in itertools.combinations(d, L):  # iterating a dict yields its KEYS (model names)
        print (subset)
# Cross-validate candidate averaging ensembles (same combinations, same
# order as the original experiments).
# BUG FIX: every print string here was corrupted by a paste
# ("Averaged base maveraged_models = ...odels score") and none of them
# named the combination being scored — report each combination explicitly.
ensemble_combos = [
    ('lasso + enet + gb', (lasso,enet, gb)),
    ('ridge + enet + gb', (ridge,enet, gb)),
    ('kerridge + enet + gb', (kerridge,enet, gb)),
    ('bayridge + enet + gb', (bayridge,enet, gb)),
    ('lasso + enet + xgb', (lasso,enet, xgb)),
    ('ridge + enet + xgb', (ridge,enet, xgb)),
    ('kerridge + enet + xgb', (kerridge,enet, xgb)),
    ('bayridge + enet + xgb', (bayridge,enet, xgb)),
    ('kerridge + enet + xgb + svr', (kerridge,enet, xgb,svr)),
    ('enet + xgb + svr', (enet, xgb,svr)),
    ('kerridge + xgb + svr', (kerridge, xgb,svr)),
]
for label, combo in ensemble_combos:
    averaged_model = AveragingModels(models = combo)
    score = rmse(averaged_model,X_train_scaled, y_train_T)
    print(" Averaged base models ({}) score: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()))
# Re-evaluate the TUNED individual models with 5-fold cross-validation.
models = [linreg,svr,lasso,ridge,kerridge,bayridge,enet,gb,xgb]
names = ['Linear regression','Support vector regression','Lasso regression','Ridge regression','Kernel ridge regression','Bayesian ridge regression',
         'Elastic net regularization','Gradient Boosting','XGBoost']
warnings.filterwarnings('ignore')
for model, name in zip(models, names):
    # Root mean square error per fold; report mean and spread.
    score = rmse(model,X_train_scaled,y_train_T)
    print("- {} : mean : {:.6f}, std : {:4f}".format(name, score.mean(),score.std()))
# Score the three most promising ensembles; kept as named variables because
# they are fitted and used for the final predictions below.
# BUG FIX: the corrupted paste-damaged print strings are replaced with
# messages that name each ensemble.
averaged_model1 = AveragingModels(models = (kerridge, svr))
score = rmse(averaged_model1,X_train_scaled, y_train_T)
print(" Averaged base models (kerridge + svr) score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
averaged_model2 = AveragingModels(models = (kerridge, svr,enet))
score = rmse(averaged_model2,X_train_scaled, y_train_T)
print(" Averaged base models (kerridge + svr + enet) score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
averaged_model3 = AveragingModels(models = ( svr,enet))
score = rmse(averaged_model3,X_train_scaled, y_train_T)
print(" Averaged base models (svr + enet) score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
#fit: train the three averaged ensembles and three base models on the full training set
k_svr = averaged_model1.fit(X_train_scaled,y_train_T)
kerridge_svr_enet = averaged_model2.fit(X_train_scaled,y_train_T)
svr_enet = averaged_model3.fit(X_train_scaled,y_train_T)
svr1 = svr.fit(X_train_scaled,y_train_T)
kerridge1 = kerridge.fit(X_train_scaled,y_train_T)
enet1 = enet.fit(X_train_scaled,y_train_T)
#predict: invert the log transform applied to the target.
# BUG FIX: the target was created with np.log (SalePrice1 = np.log(SalePrice)),
# so the exact inverse is np.exp; np.expm1 (inverse of log1p) shifted every
# predicted price down by $1.
avg_pred1_k_svr = np.exp(k_svr.predict(X_test_scaled))
avg_pred_kerridge_svr_enet = np.exp(kerridge_svr_enet.predict(X_test_scaled))
avg_pred_svr_enet = np.exp(svr_enet.predict(X_test_scaled))
avg_pred_svr = np.exp(svr.predict(X_test_scaled))
avg_pred_kerridge = np.exp(kerridge.predict(X_test_scaled))
avg_pred_enet = np.exp(enet.predict(X_test_scaled))
# Sanity check: the prediction vectors must match the number of test IDs.
X_test_scaled.shape, test_ID.count()
# One prediction vector per candidate model, with a matching file label.
p=[avg_pred1_k_svr, avg_pred_kerridge_svr_enet, avg_pred_svr_enet, avg_pred_svr, avg_pred_kerridge,avg_pred_enet]
n=['KRR_SVR','KRR_SVR_ENET','SVR_ENET','SVR','KRR','ENET']
# Write one Kaggle-style submission file (Id, SalePrice) per candidate.
for preds, label in zip(p, n):
    sub = pd.DataFrame()
    sub['Id'] = test_ID
    sub['SalePrice'] = preds
    sub.to_csv(('FINALsub_(%s).csv' % label), index=False)